{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 练习二 理解马尔科夫过程中的收获与价值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#num_states = 7\n",
    "#{\"0\": \"C1\", \"1\":\"C2\", \"2\":\"C3\", \"3\":\"Pass\", \"4\":\"Pub\", \"5\":\"FB\", \"6\":\"Sleep\"}\n",
    "i_to_n = {}\n",
    "i_to_n[\"0\"] = \"C1\"\n",
    "i_to_n[\"1\"] = \"C2\"\n",
    "i_to_n[\"2\"] = \"C3\"\n",
    "i_to_n[\"3\"] = \"Pass\"\n",
    "i_to_n[\"4\"] = \"Pub\"\n",
    "i_to_n[\"5\"] = \"FB\"\n",
    "i_to_n[\"6\"] = \"Sleep\"\n",
    "\n",
    "n_to_i = {}\n",
    "for i, name in zip(i_to_n.keys(), i_to_n.values()):\n",
    "    n_to_i[name] = int(i)\n",
    "    \n",
    "#   C1   C2   C3  Pass Pub   FB  Sleep\n",
    "Pss = [\n",
    "   [ 0.0, 0.5, 0.0, 0.0, 0.0, 0.5, 0.0 ],\n",
    "   [ 0.0, 0.0, 0.8, 0.0, 0.0, 0.0, 0.2 ],\n",
    "   [ 0.0, 0.0, 0.0, 0.6, 0.4, 0.0, 0.0 ],\n",
    "   [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0 ],\n",
    "   [ 0.2, 0.4, 0.4, 0.0, 0.0, 0.0, 0.0 ],\n",
    "   [ 0.1, 0.0, 0.0, 0.0, 0.0, 0.9, 0.0 ],\n",
    "   [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0 ]\n",
    "]\n",
    "Pss = np.array(Pss)\n",
    "rewards = [-2, -2, -2, 10, 1, -1, 0]\n",
    "gamma = 0.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "chains =[\n",
    "    [\"C1\", \"C2\", \"C3\", \"Pass\", \"Sleep\"],\n",
    "    [\"C1\", \"FB\", \"FB\", \"C1\", \"C2\", \"Sleep\"],\n",
    "    [\"C1\", \"C2\", \"C3\", \"Pub\", \"C2\", \"C3\", \"Pass\", \"Sleep\"],\n",
    "    [\"C1\", \"FB\", \"FB\", \"C1\", \"C2\", \"C3\", \"Pub\", \"C1\", \"FB\",\\\n",
    "     \"FB\", \"FB\", \"C1\", \"C2\", \"C3\", \"Pub\", \"C2\", \"Sleep\"]\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_return(start_index = 0, \n",
    "                   chain = None, \n",
    "                   gamma = 0.5) -> float:\n",
    "    '''计算一个马尔科夫奖励过程中某状态的收获值\n",
    "    Args:\n",
    "        start_index 要计算的状态在链中的位置\n",
    "        chain 要计算的马尔科夫过程\n",
    "        gamma 衰减系数\n",
    "    Returns：\n",
    "        retrn 收获值\n",
    "    '''\n",
    "    retrn, power, gamma = 0.0, 0, gamma\n",
    "    for i in range(start_index, len(chain)):\n",
    "        retrn += np.power(gamma, power) * rewards[n_to_i[chain[i]]]\n",
    "        power += 1\n",
    "    return retrn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-3.196044921875"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "compute_return(0, chains[3], gamma = 0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-3.568359375"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "compute_return(3,chains[3])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**注意：收获与价值是不同的**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_value(Pss, rewards, gamma = 0.05):\n",
    "    '''通过求解矩阵方程的形式直接计算状态的价值\n",
    "    Args：\n",
    "        P 状态转移概率矩阵 shape(7, 7)\n",
    "        rewards 即时奖励 list \n",
    "        gamma 衰减系数\n",
    "    Return\n",
    "        values 各状态的价值\n",
    "    '''\n",
    "    #assert(gamma >= 0 and gamma < 1.0) \n",
    "    #assert(len(P.shape) == 2 and P.shape[0] == P.shape[1])\n",
    "    rewards = np.array(rewards).reshape((-1,1))\n",
    "    values = np.dot(np.linalg.inv(np.eye(7,7) - gamma * Pss), rewards)\n",
    "    return values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[-12.54296219]\n",
      " [  1.4568013 ]\n",
      " [  4.32100594]\n",
      " [ 10.        ]\n",
      " [  0.80253065]\n",
      " [-22.54274676]\n",
      " [  0.        ]]\n"
     ]
    }
   ],
   "source": [
    "values = compute_value(Pss, rewards, gamma = 0.999999)\n",
    "print(values)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 读者可以自己验证下图中某状态价值\n",
    "\n",
    "![马尔科夫奖励过程价值计算](../pics/c2_3_mrp.png)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- - -\n",
    "## 马尔科夫决策过程\n",
    "\n",
    "![马尔科夫决策过程](../pics/c2_4_mdp.png)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from utils import str_key, display_dict\n",
    "from utils import set_prob, set_reward, get_prob, get_reward\n",
    "from utils import set_value, set_pi, get_value, get_pi\n",
    "\n",
    "# 构建学生马尔科夫决策过程\n",
    "S = ['浏览手机中','第一节课','第二节课','第三节课','休息中']\n",
    "A = ['浏览手机','学习','离开浏览','泡吧','退出学习']\n",
    "R = {} # 奖励Rsa\n",
    "P = {} # 状态转移概率Pss'a\n",
    "gamma = 1.0 # 衰减因子\n",
    "\n",
    "set_prob(P, S[0], A[0], S[0]) # 浏览手机中 - 浏览手机 -> 浏览手机中\n",
    "set_prob(P, S[0], A[2], S[1]) # 浏览手机中 - 离开浏览 -> 第一节课\n",
    "set_prob(P, S[1], A[0], S[0]) # 第一节课 - 浏览手机 -> 浏览手机中\n",
    "set_prob(P, S[1], A[1], S[2]) # 第一节课 - 学习 -> 第二节课\n",
    "set_prob(P, S[2], A[1], S[3]) # 第二节课 - 学习 -> 第三节课\n",
    "set_prob(P, S[2], A[4], S[4]) # 第二节课 - 退出学习 -> 退出休息\n",
    "set_prob(P, S[3], A[1], S[4]) # 第三节课 - 学习 -> 退出休息\n",
    "set_prob(P, S[3], A[3], S[1], p = 0.2) # 第三节课 - 泡吧 -> 第一节课\n",
    "set_prob(P, S[3], A[3], S[2], p = 0.4) # 第三节课 - 泡吧 -> 第一节课\n",
    "set_prob(P, S[3], A[3], S[3], p = 0.4) # 第三节课 - 泡吧 -> 第一节课\n",
    "\n",
    "set_reward(R, S[0], A[0], -1) # 浏览手机中 - 浏览手机 -> -1\n",
    "set_reward(R, S[0], A[2],  0) # 浏览手机中 - 离开浏览 -> 0\n",
    "set_reward(R, S[1], A[0], -1) # 第一节课 - 浏览手机 -> -1\n",
    "set_reward(R, S[1], A[1], -2) # 第一节课 - 学习 -> -2\n",
    "set_reward(R, S[2], A[1], -2) # 第二节课 - 学习 -> -2\n",
    "set_reward(R, S[2], A[4],  0) # 第二节课 - 退出学习 -> 0\n",
    "set_reward(R, S[3], A[1], 10) # 第三节课 - 学习 -> 10\n",
    "set_reward(R, S[3], A[3], +1) # 第三节课 - 泡吧 -> -1\n",
    "\n",
    "MDP = (S, A, R, P, gamma)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "----状态转移概率字典（矩阵）信息:----\n",
      "第一节课_浏览手机_浏览手机中:　1.00\n",
      "浏览手机中_离开浏览_第一节课:　1.00\n",
      "第二节课_学习_第三节课:　1.00\n",
      "第三节课_泡吧_第一节课:　0.20\n",
      "第三节课_泡吧_第二节课:　0.40\n",
      "第三节课_泡吧_第三节课:　0.40\n",
      "第三节课_学习_休息中:　1.00\n",
      "浏览手机中_浏览手机_浏览手机中:　1.00\n",
      "第二节课_退出学习_休息中:　1.00\n",
      "第一节课_学习_第二节课:　1.00\n",
      "\n",
      "----奖励字典（函数）信息:----\n",
      "第三节课_学习:　10.00\n",
      "浏览手机中_浏览手机:　-1.00\n",
      "第一节课_浏览手机:　-1.00\n",
      "第一节课_学习:　-2.00\n",
      "第二节课_退出学习:　0.00\n",
      "浏览手机中_离开浏览:　0.00\n",
      "第二节课_学习:　-2.00\n",
      "第三节课_泡吧:　1.00\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(\"----状态转移概率字典（矩阵）信息:----\")\n",
    "display_dict(P)\n",
    "print(\"----奖励字典（函数）信息:----\")\n",
    "display_dict(R)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 策略评估\n",
    "\n",
    "![策略评估](../pics/c2_5_mdp.png)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "----状态转移概率字典（矩阵）信息:----\n",
      "第三节课_学习:　0.50\n",
      "浏览手机中_浏览手机:　0.50\n",
      "第一节课_浏览手机:　0.50\n",
      "第一节课_学习:　0.50\n",
      "第二节课_退出学习:　0.50\n",
      "浏览手机中_离开浏览:　0.50\n",
      "第二节课_学习:　0.50\n",
      "第三节课_泡吧:　0.50\n",
      "\n",
      "----状态转移概率字典（矩阵）信息:----\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# S = ['浏览手机中','第一节课','第二节课','第三节课','休息中']\n",
    "# A = ['继续浏览','学习','离开浏览','泡吧','退出学习']\n",
    "# 设置行为策略：pi(a|.) = 0.5\n",
    "Pi = {}\n",
    "set_pi(Pi, S[0], A[0], 0.5) # 浏览手机中 - 浏览手机\n",
    "set_pi(Pi, S[0], A[2], 0.5) # 浏览手机中 - 离开浏览\n",
    "set_pi(Pi, S[1], A[0], 0.5) # 第一节课 - 浏览手机\n",
    "set_pi(Pi, S[1], A[1], 0.5) # 第一节课 - 学习\n",
    "set_pi(Pi, S[2], A[1], 0.5) # 第二节课 - 学习\n",
    "set_pi(Pi, S[2], A[4], 0.5) # 第二节课 - 退出学习\n",
    "set_pi(Pi, S[3], A[1], 0.5) # 第三节课 - 学习\n",
    "set_pi(Pi, S[3], A[3], 0.5) # 第三节课 - 泡吧\n",
    "\n",
    "print(\"----状态转移概率字典（矩阵）信息:----\")\n",
    "display_dict(Pi)\n",
    "# 初始时价值为空，访问时会返回0\n",
    "print(\"----状态转移概率字典（矩阵）信息:----\")\n",
    "V = {}\n",
    "display_dict(V)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_q(MDP, V, s, a):\n",
    "    '''根据给定的MDP，价值函数V，计算状态行为对s,a的价值qsa\n",
    "    '''\n",
    "    S, A, R, P, gamma = MDP\n",
    "    q_sa = 0\n",
    "    for s_prime in S:\n",
    "        q_sa += get_prob(P, s,a,s_prime) * get_value(V, s_prime)\n",
    "    q_sa = get_reward(R, s,a) + gamma * q_sa\n",
    "    return q_sa\n",
    "\n",
    "\n",
    "def compute_v(MDP, V, Pi, s):\n",
    "    '''给定MDP下依据某一策略Pi和当前状态价值函数V计算某状态s的价值\n",
    "    '''\n",
    "    S, A, R, P, gamma = MDP\n",
    "    v_s = 0\n",
    "    for a in A:\n",
    "        v_s += get_pi(Pi, s,a) * compute_q(MDP, V, s, a)\n",
    "    return v_s\n",
    "        \n",
    "\n",
    "# 根据当前策略使用回溯法来更新状态价值，本章不做要求\n",
    "def update_V(MDP, V, Pi):\n",
    "    '''给定一个MDP和一个策略，更新该策略下的价值函数V\n",
    "    '''\n",
    "    S, _, _, _, _ = MDP\n",
    "    V_prime = V.copy()\n",
    "    for s in S:\n",
    "        #set_value(V_prime, s, V_S(MDP, V_prime, Pi, s))\n",
    "        V_prime[str_key(s)] = compute_v(MDP, V_prime, Pi, s)\n",
    "    return V_prime\n",
    "\n",
    "\n",
    "# 策略评估，得到该策略下最终的状态价值。本章不做要求\n",
    "def policy_evaluate(MDP, V, Pi, n):\n",
    "    '''使用n次迭代计算来评估一个MDP在给定策略Pi下的状态价值，初始时价值为V\n",
    "    '''\n",
    "    for i in range(n):\n",
    "        V = update_V(MDP, V, Pi)\n",
    "        #display_dict(V)\n",
    "    return V"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "浏览手机中:　-2.31\n",
      "第三节课:　7.38\n",
      "第一节课:　-1.31\n",
      "第二节课:　2.69\n",
      "休息中:　0.00\n",
      "\n",
      "第三节课在当前策略下的价值为:7.38\n"
     ]
    }
   ],
   "source": [
    "V = policy_evaluate(MDP, V, Pi, 100)\n",
    "display_dict(V)\n",
    "# 验证状态在某策略下的价值\n",
    "v = compute_v(MDP, V, Pi, \"第三节课\")\n",
    "print(\"第三节课在当前策略下的价值为:{:.2f}\".format(v))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 价值迭代得到最优状态价值过程\n",
    "def compute_v_from_max_q(MDP, V, s):\n",
    "    '''根据一个状态的下所有可能的行为价值中最大一个来确定当前状态价值\n",
    "    '''\n",
    "    S, A, R, P, gamma = MDP\n",
    "    v_s = -float('inf')\n",
    "    for a in A:\n",
    "        qsa = compute_q(MDP, V, s, a)\n",
    "        if qsa >= v_s:\n",
    "            v_s = qsa\n",
    "    return v_s\n",
    "\n",
    "def update_V_without_pi(MDP, V):\n",
    "    '''在不依赖策略的情况下直接通过后续状态的价值来更新状态价值\n",
    "    '''\n",
    "    S, _, _, _, _ = MDP\n",
    "    V_prime = V.copy()\n",
    "    for s in S:\n",
    "        set_value(V_prime, s, compute_v_from_max_q(MDP, V_prime, s))\n",
    "        #V_prime[str_key(s)] = compute_v_from_max_q(MDP, V_prime, s)\n",
    "    return V_prime\n",
    "\n",
    "# 价值迭代，本章不作要求\n",
    "def value_iterate(MDP, V, n):\n",
    "    '''价值迭代\n",
    "    '''\n",
    "    for i in range(n):\n",
    "        V = update_V_without_pi(MDP, V)\n",
    "        display_dict(V)\n",
    "    return V"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "浏览手机中:　0.00\n",
      "休息中:　0.00\n",
      "第一节课:　0.00\n",
      "第三节课:　10.00\n",
      "第二节课:　0.00\n",
      "\n",
      "浏览手机中:　0.00\n",
      "第一节课:　0.00\n",
      "第三节课:　10.00\n",
      "第二节课:　8.00\n",
      "休息中:　0.00\n",
      "\n",
      "浏览手机中:　0.00\n",
      "休息中:　0.00\n",
      "第一节课:　6.00\n",
      "第二节课:　8.00\n",
      "第三节课:　10.00\n",
      "\n",
      "浏览手机中:　6.00\n",
      "第三节课:　10.00\n",
      "第一节课:　6.00\n",
      "第二节课:　8.00\n",
      "休息中:　0.00\n",
      "\n",
      "浏览手机中:　6.00\n",
      "第三节课:　10.00\n",
      "第一节课:　6.00\n",
      "第二节课:　8.00\n",
      "休息中:　0.00\n",
      "\n",
      "在状态第三节课选择行为泡吧的最优价值为:9.40\n"
     ]
    }
   ],
   "source": [
    "V = {}\n",
    "# 通过价值迭代得到最优状态价值及\n",
    "V_star = value_iterate(MDP, V, 4)\n",
    "display_dict(V_star)\n",
    "\n",
    "# 验证最优行为价值\n",
    "s, a = \"第三节课\", \"泡吧\"\n",
    "q = compute_q(MDP, V_star, \"第三节课\", \"泡吧\")\n",
    "print(\"在状态{}选择行为{}的最优价值为:{:.2f}\".format(s,a,q))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# display q_star\n",
    "def display_q_star(MDP, V_star):\n",
    "    S, A, _, _, _ = MDP\n",
    "    for s in S:\n",
    "        for a in A:\n",
    "            print(\"q*({},{}):{}\".format(s,a, compute_q(MDP, V_star, s, a)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "q*(浏览手机中,浏览手机):5.0\n",
      "q*(浏览手机中,学习):0.0\n",
      "q*(浏览手机中,离开浏览):6.0\n",
      "q*(浏览手机中,泡吧):0.0\n",
      "q*(浏览手机中,退出学习):0.0\n",
      "q*(第一节课,浏览手机):5.0\n",
      "q*(第一节课,学习):6.0\n",
      "q*(第一节课,离开浏览):0.0\n",
      "q*(第一节课,泡吧):0.0\n",
      "q*(第一节课,退出学习):0.0\n",
      "q*(第二节课,浏览手机):0.0\n",
      "q*(第二节课,学习):8.0\n",
      "q*(第二节课,离开浏览):0.0\n",
      "q*(第二节课,泡吧):0.0\n",
      "q*(第二节课,退出学习):0.0\n",
      "q*(第三节课,浏览手机):0.0\n",
      "q*(第三节课,学习):10.0\n",
      "q*(第三节课,离开浏览):0.0\n",
      "q*(第三节课,泡吧):9.4\n",
      "q*(第三节课,退出学习):0.0\n",
      "q*(休息中,浏览手机):0.0\n",
      "q*(休息中,学习):0.0\n",
      "q*(休息中,离开浏览):0.0\n",
      "q*(休息中,泡吧):0.0\n",
      "q*(休息中,退出学习):0.0\n"
     ]
    }
   ],
   "source": [
    "display_q_star(MDP, V_star)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
